####################################
#Preparing the expression table
####################################
## Set working directory for first batch of data and import rsem data ##
dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2097"
samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.batch1", header = F, check.names = F)
sample = samples$V1
files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
names(files) = sample
txi.rsem = tximport(files, type = "rsem")
counts1 = data.frame(txi.rsem$counts, check.names = F)
tpms1 = data.frame(txi.rsem$abundance, check.names = F)
## Repeat for the second batch of data ##
dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2022"
samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.batch2", header = F, check.names = F)
sample = samples$V1
files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
names(files) = sample
txi.rsem1 = tximport(files, type = "rsem")
counts2 = data.frame(txi.rsem1$counts, check.names = F)
tpms2 = data.frame(txi.rsem1$abundance, check.names = F)
## We need to change sample names because of overlap between batches. We will append the batch name to the sample ID in the count matrix and then merge the two datasets ##
colnames(counts1) <- paste(colnames(counts1), "1", sep = "_")
colnames(tpms1) <- paste(colnames(tpms1), "1", sep = "_")
colnames(counts2) <- paste(colnames(counts2), "2", sep = "_")
colnames(tpms2) <- paste(colnames(tpms2), "2", sep = "_")
dim(counts1)[1] 58929 216
[1] 58929 121
[1] 58929 216
[1] 58929 121
tot_counts = merge(counts1, counts2, by = 0)
rownames(tot_counts) = tot_counts$Row.names
tot_counts$Row.names = NULL
tot_tpms = merge(tpms1, tpms2, by = 0)
rownames(tot_tpms) = tot_tpms$Row.names
tot_tpms$Row.names = NULL
dim(tot_tpms)[1] 58929 337
[1] 58929 337
counts = tot_counts
tpms = tot_tpms
## We need to then do the same for metadata, but batch 1 and 2 are rnaseq batch 1, and batch 3 here, is batch2 ... ##
mynd_metadata = read.table('/sc/hydra/projects/pd-omics/mynd/qc/tmp.meta', header = T, check.names = F)
rownames(mynd_metadata) = paste(mynd_metadata$Sample, mynd_metadata$Batch, sep = "_")
rownames(mynd_metadata) = gsub("_2", "_1", rownames(mynd_metadata))
rownames(mynd_metadata) = gsub("_3", "_2", rownames(mynd_metadata))
rownames(mynd_metadata) %in% colnames(tpms)[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [183] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [197] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [225] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [239] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [281] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [295] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [309] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [323] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [337] TRUE
# You don't need to run everythin again, we can load the RData from here!
#load(paste0(work_dir, "mynd.expression.RData"))
### Removing genes with low expression
### >= 30%
x = data.frame(counts, check.names = F)
cpm = cpm(x)
keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x)
#These are the final tables of expression!
mynd_cpm = cpm[keep.exp, ]
mynd_abundance = tpms[keep.exp,]
mynd_counts = counts[keep.exp,]
# Normalization
norm = calcNormFactors(mynd_counts, method = "TMM") #normalized with TMM
#Creat Limma object
x <- DGEList(counts=mynd_counts, samples=mynd_metadata, norm.factors = norm)
v = voom(x)##Exploratory Analysis ### Library sizes
[1] 337
#We can also plot the library sizes as a barplot to see whether there are any major discrepancies between the samples more easily.
#png(paste0(work_plots, "library_sizes.#png"), width = 16, height = 10, res = 300, units = "in")
barplot(x$samples$lib.size, names=colnames(x), las=2, cex.axis = 0.8)
title("Barplot of library sizes")
#dev.off()# To do the MDS plot is necessary to check the order for the colors.
# Because of this we are using the ordered_metadata.
#plotMDS(x)
#png(paste0(work_plots, "MDS.#png"), width = 8, height = 6, res = 300, units = "in")
col.group <- mynd_metadata$Batch
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
plotMDS(mynd_voom, col= col.group)
title("MDS by Batch")
getResiduals <- function(exprLine) {
ourCovRegression <- lm(exprLine ~ mynd_metadata$Batch + mynd_metadata$Gender + mynd_metadata$Age);
ourResiduals <- ourCovRegression$residuals;
return(ourResiduals)
}
allResiduals <- data.frame(t(apply(mynd_voom, 1, getResiduals)), check.names = F)
resid = as.matrix(allResiduals)
#png(paste0(work_plots, "MDSreg.png"), width = 8, height = 6, res = 300, units = "in")
plotMDS(resid, col= col.group)
title("MDS by Status")
#dev.off()#PCA
mynd_voom_t = t(mynd_voom)
#png(paste0(work_plots, "PCA_status.#png"), width = 8, height = 6, res = 300, units = "in")
autoplot(prcomp(mynd_voom_t), data = mynd_metadata, colour = "Batch")
cov = t(resid)
autoplot(prcomp(cov), data = mynd_metadata, colour = "Batch")
#dev.off()#Only 2 samples
#png(paste0(work_plots, "Scatter_all.#png"), width = 8, height = 6, res = 300, units = "in")
par( mfrow = c( 2,2 ))
plot(log2(mynd_counts[,1:2] + 1), pch=16, cex=0.3, main = "COUNTS")
plot(log2(mynd_cpm[,1:2] + 1), pch=16, cex=0.3, main = "CPM")
plot(log2(mynd_abundance[,1:2] + 1), pch=16, cex=0.3, main = "TPM")
plot(mynd_voom[,1:2], pch=16, cex=0.3, main = "VOOM")
#dev.off()#Density plot with all samples
# cpm table = ppmi_cpm_case_control
# log cpm table = lcpm
lcounts = log2(mynd_counts)
lcpm = log2(mynd_cpm)
ltpm = log2(mynd_abundance)
# voom use log alread
nsamples <- ncol(mynd_cpm)
#samplenames <- colnames(mynd_cpm)
colfunc <- colorRampPalette(c("#4DBBD5FF", "#3C5488FF"))
col = alpha(colfunc(nsamples), alpha = 0.1)
#png(paste0(work_plots, "Density_all.#png"), width = 8, height = 6, res = 300, units = "in")
par(mfrow=c(2,2))
#COUNTS
plot(density(lcounts[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. COUNTS ", xlab="Log2(counts)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcounts[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#CPM
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. CPM ", xlab="Log2(CPM)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#TPM
plot(density(ltpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="C. TPM ", xlab="Log2(TPM)")
#abline(v=ltpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(ltpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#Voom
plot(density(mynd_voom[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="D. VOOM ", xlab="voom")
#abline(v=lvoom.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(mynd_voom[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#dev.off()###Boxplot with log + 1 Only with few samples ###
#png(paste0(work_plots, "Boxplot_all.#png"), width = 8, height = 6, res = 300, units = "in")
#Counts
boxplot(log2(mynd_counts +1), col=rainbow(20), main="Counts", ylab="Log2(Counts+1)",las=2,cex.axis=0.8)
#CPM
boxplot(log2(mynd_cpm +1), col=rainbow(20), main="CPM", ylab="Log2(CPM+1)",las=2,cex.axis=0.8)
#TPM
boxplot(log2(mynd_abundance +1), col=rainbow(20), main="TPM", ylab="Log2(TPM+1)",las=2,cex.axis=0.8)
#Voom
boxplot(mynd_voom, col=rainbow(20), main="Voom", ylab="Voom",las=2,cex.axis=0.8)
#dev.off()###Heatmaps NOT GOOD FOR 600 samples ### 8 min CHECK THE BAR COLORS
par(mfrow=c(1,1))
#Spearman
col.cell <- c('grey','black')[mynd_metadata$Status]
cormatrix = rcorr(as.matrix(mynd_voom), type='spearman')
corrdata = as.matrix(cormatrix$r)
#png(paste0(work_plots, "HM_Spearman.#png"), width = 20, height = 14, res = 300, units = "in")
heatmap.2(corrdata, main="Voom Spearman",ColSideColors = col.cell, trace="none", col = c(sort(brewer.pal(9,"Blues")),brewer.pal(9,"Reds")), margins = c(1,1))
#dev.off()#Pearson
cormatrix = rcorr(as.matrix(mynd_voom), type='pearson')
corrdata = as.matrix(cormatrix$r)
#png(paste0(work_plots, "HM_Pearson.#png"), width = 20, height = 14, res = 300, units = "in")
heatmap.2(corrdata, main="Voom Pearson", ColSideColors = col.cell,trace="none", col = c(sort(brewer.pal(9,"Blues")),brewer.pal(9,"Reds")), margins = c(1,1))
#dev.off()#Euclidean distance with the transposed matrix
sampleTree = hclust(dist(mynd_voom_t), method = "average")
#png(paste0(work_plots, "tree_euclidean.#png"), width = 20, height = 10, res = 300, units = "in")
plot(sampleTree, main = "Sample clustering to detecting outliers",
sub = "Function: hclust, Method: average from Euclidean distance", xlab = "samples",
cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)
#dev.off()#Spearman correlation #10 min
#png(paste0(work_plots, "tree_Spearman.#png"), width = 20, height = 10, res = 300, units = "in")
#plot(hcluster(mynd_voom_t, method= "spearman", link="average" ),
# cex.lab = 1.5, cex.axis = 1.5, cex.main = 2, main= "Hierarchical Clustering",
# sub = "Function: hcluster, Method: Spearman", xlab = "samples")
#dev.off()total = merge(mynd_voom_t, mynd_metadata, by = 0)
total$mismatch = ""
labelIndicesWeWant <- total$Gender == "M" & total$ENSG00000229807.11 > 5 | total$Gender == "F" & total$ENSG00000183878.15 > 0
total$mismatch[labelIndicesWeWant] <- as.character(total$Row.names[labelIndicesWeWant])
p = ggplot(total, aes(x = ENSG00000229807.11, y = ENSG00000183878.15, color = Gender, label = mismatch))
p + geom_point() + geom_text_repel() + xlab("XIST") + ylab("UTY")#png( "cell_marker_genes.#png", width = 22, height = 11, res = 300, units = "in")
my_palette <- colorRampPalette(c('Light Blue', "White", "Red"))
library(gplots)
markers = read.table('/sc/hydra/projects/pd-omics/mynd/qc/marker.genes', header = F)
total = merge(markers, mynd_abundance, by.y = 0, by.x = 'V1', sort = F)
rownames(total) = total$V2
total$V1 = NULL
total$V2 = NULL
total$Row.names = NULL
total = as.matrix(log2(total + 1))
heatmap.2(total, dendrogram="both", Rowv =F, Colv=T, cexCol = .4, trace="none", scale = "none", col=my_palette, adjRow = c(0, 0.5), margins = c(6, 8), key = TRUE, keysize = 1.3, xlab = "Samples", ylab = "Genes", main = "Expression")
#dev.off()